# Dylan Carlson Sirvent León
# Alex Gazmararian

library(here)
library(readxl)
library(scales)
library(tidyverse)

# Configuration
# Directory holding the raw House returns and the exclusions file read below.
elections_data_dir <- here("data", "input", "elections_cong")
# NOTE: despite its name, this is the path of the output CSV file (it is passed
# to write_csv() at the end of the script), not a directory.
output_dir <- here("data", "inter", "congressional_elections.csv")

# Data validation functions
# Report structural and data-quality diagnostics for candidate-level returns.
#
# Everything is reported through message()/print(); the data are not modified
# and the function is called for its console output only.
#
# Checks performed:
#   * row and column counts
#   * which of the expected key columns are absent
#   * number of NA values in each key column that IS present
#   * negative candidate / total vote counts
#   * rows whose candidate votes exceed the reported total votes
#
# Args:
#   data:       Data frame with one row per candidate entry.
#   stage_name: Label shown in the section header.
validate_raw_data <- function(data, stage_name) {
  message("=== Data Validation: ", stage_name, " ===")

  # Basic structure validation
  message("Total rows: ", nrow(data))
  message("Total columns: ", ncol(data))

  # Check for missing key variables
  key_vars <- c(
    "year", "state_po", "district", "candidate", "party",
    "candidatevotes", "totalvotes"
  )
  missing_vars <- setdiff(key_vars, names(data))
  if (length(missing_vars) > 0) {
    message("WARNING: Missing key variables: ", paste(missing_vars, collapse = ", "))
  }

  # Count NAs per key column. Only the columns that exist are inspected, so a
  # single absent column no longer silences the NA report for all the others.
  for (var in intersect(key_vars, names(data))) {
    na_count <- sum(is.na(data[[var]]))
    if (na_count > 0) {
      message("WARNING: ", na_count, " missing values in ", var)
    }
  }

  # Check for negative vote counts
  if ("candidatevotes" %in% names(data)) {
    negative_candidate_votes <- sum(data$candidatevotes < 0, na.rm = TRUE)
    if (negative_candidate_votes > 0) {
      message("WARNING: ", negative_candidate_votes, " entries with negative candidate votes")
      message("Negative vote entries:")
      # any_of() shows whichever identifying columns are available instead of
      # failing when one of them is absent.
      print(
        data %>%
          filter(candidatevotes < 0) %>%
          select(any_of(key_vars))
      )
    }
  }

  if ("totalvotes" %in% names(data)) {
    negative_total_votes <- sum(data$totalvotes < 0, na.rm = TRUE)
    if (negative_total_votes > 0) {
      message("WARNING: ", negative_total_votes, " entries with negative total votes")
    }
  }

  # Check for unrealistic vote counts (candidate votes > total votes).
  # Only rows where both counts are present and positive are compared.
  if (all(c("candidatevotes", "totalvotes") %in% names(data))) {
    invalid_votes <- data %>%
      filter(!is.na(candidatevotes), !is.na(totalvotes), candidatevotes > 0, totalvotes > 0) %>%
      filter(candidatevotes > totalvotes)

    if (nrow(invalid_votes) > 0) {
      message("WARNING: ", nrow(invalid_votes), " entries where candidate votes > total votes")
      message("Sample invalid entries:")
      print(head(invalid_votes %>% select(any_of(setdiff(key_vars, "party")))))
    }
  }

  message("")
}

# Print how many rows carry each party label and flag any label other than
# "D" or "R". Does nothing beyond the header and trailing blank line when the
# data have no `party` column.
#
# Args:
#   data:       Data frame, expected to contain a `party` column.
#   stage_name: Label shown in the section header.
validate_party_data <- function(data, stage_name) {
  message("=== Party Validation: ", stage_name, " ===")

  has_party_column <- "party" %in% names(data)
  if (has_party_column) {
    # Row tally per label, most frequent first
    tally <- count(data, party, sort = TRUE)
    message("Party distribution:")
    print(tally)

    # Anything that is not one of the two recoded major-party labels
    major_labels <- c("D", "R")
    labels_seen <- as.vector(tally$party)
    odd_labels <- unique(labels_seen[is.na(match(labels_seen, major_labels))])
    if (length(odd_labels) > 0) {
      message("WARNING: Unexpected party values: ", paste(odd_labels, collapse = ", "))
    }
  }

  message("")
}

# Report how many races the data cover and which races look incomplete.
#
# A race is one distinct (year, state, district) combination. All checks are
# skipped unless `year`, `state` and `district` are present; the per-race
# candidate and party checks additionally need a `party` column. Output goes
# to message()/print(); the data are not modified.
#
# Args:
#   data:       Data frame with one row per candidate entry.
#   stage_name: Label shown in the section header.
validate_election_coverage <- function(data, stage_name) {
  message("=== Election Coverage Validation: ", stage_name, " ===")

  if (all(c("year", "state", "district") %in% names(data))) {
    # Check election coverage by year
    yearly_elections <- data %>%
      distinct(year, state, district) %>%
      count(year, name = "elections_count")
    message("Elections by year:")
    print(yearly_elections)

    # Check for states with unusually few districts.
    # Note: district_count tallies distinct (year, state, district) rows, so a
    # district that appears in two election years is counted twice.
    state_districts <- data %>%
      distinct(year, state, district) %>%
      count(state, name = "district_count") %>%
      arrange(district_count)

    message("States with fewest districts (potential issues):")
    print(head(state_districts, 10))

    if ("party" %in% names(data)) {
      # Check for races with only one candidate (one row in the race)
      single_candidate_races <- data %>%
        group_by(year, state, district) %>%
        summarise(candidate_count = n(), party_count = n_distinct(party), .groups = "drop") %>%
        filter(candidate_count == 1)

      if (nrow(single_candidate_races) > 0) {
        message("WARNING: ", nrow(single_candidate_races), " races with only one candidate")
        message("Sample single-candidate races:")
        print(head(single_candidate_races))
      }

      # Check for races missing major party representation.
      # `%in%` never yields NA, whereas any(party == "D") is NA for a race
      # that has an NA party and no match; that NA used to reach
      # `if (count > 0)` below and abort with "missing value where TRUE/FALSE
      # needed".
      major_party_coverage <- data %>%
        group_by(year, state, district) %>%
        summarise(
          has_democrat = "D" %in% party,
          has_republican = "R" %in% party,
          .groups = "drop"
        )

      missing_dem_count <- sum(!major_party_coverage$has_democrat)
      missing_rep_count <- sum(!major_party_coverage$has_republican)

      if (missing_dem_count > 0) {
        message("INFO: ", missing_dem_count, " races missing Democratic candidates")
      }
      if (missing_rep_count > 0) {
        message("INFO: ", missing_rep_count, " races missing Republican candidates")
      }
    }
  }

  message("")
}

# Sanity-check race-level two-party results.
#
# Two independent groups of checks, each run only when its columns exist:
#   * dem_vote_share / rep_vote_share: values outside [0, 100], and pairs
#     whose sum differs from 100 by more than 1 point. Rows with a missing
#     (NA/NaN) share are left out of both checks.
#   * dem_total_votes / rep_total_votes: negative totals.
# Findings are reported with message()/print(); the data are not modified.
#
# Args:
#   data:       Data frame with one row per race.
#   stage_name: Label shown in the section header.
validate_vote_margins <- function(data, stage_name) {
  message("=== Vote Margins Validation: ", stage_name, " ===")

  # Identifying columns shown next to offending rows. They are selected with
  # any_of() so a report is still printed (rather than an error raised) when
  # the data lack some of them.
  id_cols <- c("election_year", "state", "district")

  # Check for valid vote shares
  if (all(c("dem_vote_share", "rep_vote_share") %in% names(data))) {
    invalid_shares <- data %>%
      filter(!is.na(dem_vote_share), !is.na(rep_vote_share)) %>%
      filter(dem_vote_share < 0 | rep_vote_share < 0 |
             dem_vote_share > 100 | rep_vote_share > 100)

    if (nrow(invalid_shares) > 0) {
      message("WARNING: ", nrow(invalid_shares), " entries with invalid vote shares (< 0% or > 100%)")
      print(invalid_shares %>% select(any_of(id_cols), dem_vote_share, rep_vote_share))
    }

    # Check for vote shares that don't sum to ~100%
    vote_share_sums <- data %>%
      filter(!is.na(dem_vote_share), !is.na(rep_vote_share)) %>%
      mutate(total_share = dem_vote_share + rep_vote_share) %>%
      filter(abs(total_share - 100) > 1)  # Allow 1% tolerance

    if (nrow(vote_share_sums) > 0) {
      message("WARNING: ", nrow(vote_share_sums), " entries where Dem + Rep vote shares don't sum to ~100%")
      message("Sample entries:")
      print(head(vote_share_sums %>% select(any_of(id_cols), dem_vote_share, rep_vote_share, total_share)))
    }
  }

  # Check for negative vote totals
  if (all(c("dem_total_votes", "rep_total_votes") %in% names(data))) {
    negative_dem <- sum(data$dem_total_votes < 0, na.rm = TRUE)
    negative_rep <- sum(data$rep_total_votes < 0, na.rm = TRUE)

    if (negative_dem > 0) {
      message("WARNING: ", negative_dem, " entries with negative Democratic vote totals")
      negative_dem_entries <- data %>% filter(dem_total_votes < 0)
      print(negative_dem_entries %>% select(any_of(id_cols), dem_total_votes, rep_total_votes))
    }

    if (negative_rep > 0) {
      message("WARNING: ", negative_rep, " entries with negative Republican vote totals")
      negative_rep_entries <- data %>% filter(rep_total_votes < 0)
      print(negative_rep_entries %>% select(any_of(id_cols), dem_total_votes, rep_total_votes))
    }
  }

  message("")
}

# Emit the closing banner for the validation output: a header, two reminder
# lines and a trailing blank line, each sent through message().
print_validation_summary <- function() {
  closing_lines <- c(
    "=== Data Validation Complete ===",
    "Review all WARNING messages above for potential data quality issues.",
    "Consider excluding problematic entries or investigating data sources.",
    ""
  )
  for (closing_line in closing_lines) {
    message(closing_line)
  }
}

# Define helper functions for congressional election data processing
# Flag party labels that mention any of the given keywords, ignoring case in
# the label, while rejecting labels that contain "write-in".
#
# Args:
#   party_text: Character vector of party labels.
#   ...:        Keywords, combined with "|" into one regular expression and
#               matched against the lower-cased label (so keywords should be
#               lower case).
#
# Returns a logical vector the same length as `party_text`.
detect_party <- function(party_text, ...) {
  label_lower <- tolower(party_text)
  any_keyword <- paste(c(...), collapse = "|")
  mentions_keyword <- str_detect(label_lower, any_keyword)
  is_write_in <- str_detect(label_lower, "write-in")
  mentions_keyword & !is_write_in
}

# Load congressional election data (House races 2020-2022)
message("Loading congressional election data...")
# elections_data_dir was already built with here(), so the file name is
# appended with file.path() rather than passing the rooted path through here()
# a second time (which depends on how here() treats an absolute first argument).
congressional_elections_raw <- read_csv(file.path(elections_data_dir, "1976-2022-house.csv")) |>
  filter(year %in% 2020:2022) |>
  # Create unique congressional district election ID for data joins.
  # cur_group_id() numbers the (year, state, district) groups of the filtered
  # rows, so the IDs are only reproducible for the same input file and the same
  # year filter; the exclusions file joined later is keyed on these IDs.
  group_by(year, state, district) |>
  mutate(cong_election_id = str_pad(cur_group_id(), width = 4, pad = "0"), .before = year) |>
  ungroup()

# Validate raw data
validate_raw_data(congressional_elections_raw, "Raw Congressional Elections Data")

# Filter to major party candidates (Democrats and Republicans) in congressional races
message("Filtering to major party candidates...")
congressional_elections_clean <- congressional_elections_raw |>
  select(cong_election_id, year, state_po, district, candidate, party, candidatevotes, totalvotes) |>
  rename(state = state_po) |>
  # Keeps labels mentioning either keyword; write-in labels and NA are dropped
  filter(detect_party(party, "democrat", "republican")) |>
  mutate(
    # Recode with the same keywords the filter used. Every surviving row
    # contains one of them, so no row is left NA. The previous first-letter
    # rule ("^d" / "^r") gave NA for a kept label such as
    # "INDEPENDENT-REPUBLICAN" and "R" for one such as "REFORM DEMOCRAT".
    # A label containing both keywords is coded "D" (first matching rule).
    party = case_when(
      str_detect(tolower(party), "democrat") ~ "D",
      str_detect(tolower(party), "republican") ~ "R"
    )
  )

# Validate party filtering
validate_party_data(congressional_elections_clean, "After Party Filtering")
validate_election_coverage(congressional_elections_clean, "After Party Filtering")

# Handle special congressional election cases (exclusions)
message("Loading and applying exclusions...")
# elections_data_dir is already rooted via here(); append the file name directly.
congressional_election_exclusions <- read_csv(file.path(elections_data_dir, "missing_elections_notated.csv")) |>
  select(id, candidate, exclude) |>
  # The join key on the elections side is a zero-padded 4-character string.
  # Normalising `id` the same way keeps the join working even if read_csv()
  # guesses a numeric type for the column; already-padded IDs are unchanged.
  mutate(id = str_pad(as.character(id), width = 4, pad = "0"))

message("Exclusions loaded: ", nrow(congressional_election_exclusions), " entries")
message("Exclusions to apply: ", sum(congressional_election_exclusions$exclude == 1, na.rm = TRUE), " candidates")

# Apply exclusions to congressional election dataset.
# Only entries flagged exclude == 1 remove a candidate; entries with NA or any
# other value leave the candidate in place. anti_join() can only drop rows, so
# a repeated (id, candidate) key in the exclusions file cannot duplicate a
# candidate row and inflate vote totals the way a left_join() would.
congressional_elections_clean <- congressional_elections_clean |>
  anti_join(
    congressional_election_exclusions |>
      filter(exclude == 1) |>
      distinct(id, candidate),
    by = c("cong_election_id" = "id", "candidate")
  )

# Validate after exclusions
validate_election_coverage(congressional_elections_clean, "After Exclusions")

# Calculate congressional vote margins and competitiveness
message("Calculating vote margins and competitiveness...")
congressional_vote_margins <- congressional_elections_clean |>
  # Total votes per party within each race. (The district-wide and combined
  # major-party totals that used to be computed here were discarded by the
  # distinct() below and never reached the output, so they are no longer built.)
  group_by(cong_election_id, party) |>
  mutate(party_total_votes = sum(candidatevotes)) |>
  ungroup() |>
  distinct(cong_election_id, year, party, state, district, party_total_votes) |>
  # One row per race with a D and an R column
  pivot_wider(names_from = party, values_from = party_total_votes) |>
  mutate(
    # A party with no candidate in a race gets 0 votes. NOTE: a party total
    # that is NA because one of its candidatevotes was NA also becomes 0 here.
    # Columns are named explicitly; `D:R` depended on their relative position.
    across(c(D, R), ~ replace_na(.x, 0)),
    # Shares of the two-party vote, in percent (NaN when D + R is 0)
    dem_vote_share = D / (D + R) * 100,
    rep_vote_share = R / (D + R) * 100
  ) |>
  rename(dem_total_votes = D, rep_total_votes = R, cong_district_election_id = cong_election_id, election_year = year) |>
  mutate(
    vote_margin = abs(dem_vote_share - rep_vote_share),
    # Competitive = margin of 5 points or less; an undefined margin counts as 0
    is_competitive_race = case_when(vote_margin <= 5 ~ 1, TRUE ~ 0)
  )

# Validate final vote margins
validate_vote_margins(congressional_vote_margins, "Final Vote Margins")

# Print final validation summary
print_validation_summary()

# Debugging aid: list every race in which either party's vote total is negative
negative_vote_races <- filter(
  congressional_vote_margins,
  dem_total_votes < 0 | rep_total_votes < 0
)

if (nrow(negative_vote_races) > 0) {
  message("FINAL CHECK: Found ", nrow(negative_vote_races), " races with negative vote totals:")
  print(
    select(
      negative_vote_races,
      election_year, state, district, dem_total_votes, rep_total_votes
    )
  )
  message("These likely represent uncontested races or data quality issues.")
}

# Tag the result columns with a `_cong` marker so they cannot be confused with
# columns from other election data (lookup is new name = current name)
congressional_vote_margins <- congressional_vote_margins |>
  rename(all_of(c(
    dem_total_votes_cong = "dem_total_votes",
    rep_total_votes_cong = "rep_total_votes",
    margin_cong = "vote_margin",
    dem_vote_share_cong = "dem_vote_share",
    rep_vote_share_cong = "rep_vote_share",
    is_competitive_cong = "is_competitive_race"
  )))

# Write the race-level table to the output CSV (output_dir holds the file path)
message("Saving final dataset to: ", output_dir)
write_csv(congressional_vote_margins, file = output_dir)
message("Data processing complete. Final dataset contains ", nrow(congressional_vote_margins), " races.")